{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {},
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from bertopic import BERTopic\n",
        "from sentence_transformers import SentenceTransformer\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from transformers.pipelines import pipeline\n",
        "from umap import UMAP\n",
        "from hdbscan import HDBSCAN\n",
        "from sklearn.feature_extraction.text import CountVectorizer"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 默认创建"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "UMAP(low_memory=False, metric='cosine', min_dist=0.0, n_components=5) cosine 0.0 15 5\n",
            "HDBSCAN(min_cluster_size=10, prediction_data=True) euclidean True\n"
          ]
        }
      ],
      "source": [
        "topic_model = BERTopic()\n",
        "print(topic_model.umap_model, topic_model.umap_model.metric, topic_model.umap_model.min_dist, topic_model.umap_model.n_neighbors, topic_model.umap_model.n_components)\n",
        "print(topic_model.hdbscan_model, topic_model.hdbscan_model.metric, topic_model.hdbscan_model.prediction_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 自行创建"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "UMAP(n_components=5) euclidean 0.1 15 5\n",
            "HDBSCAN(min_cluster_size=20, min_samples=20) euclidean False\n"
          ]
        }
      ],
      "source": [
        "umap_model = UMAP(\n",
        "  n_neighbors=15,\n",
        "  n_components=5,\n",
        ")\n",
        "\n",
        "hdbscan_model = HDBSCAN(\n",
        "  min_cluster_size=20,\n",
        "  min_samples=20,\n",
        ")\n",
        "\n",
        "topic_model = BERTopic(\n",
        "  umap_model=umap_model,\n",
        "  hdbscan_model=hdbscan_model,\n",
        ")\n",
        "\n",
        "print(topic_model.umap_model, topic_model.umap_model.metric, topic_model.umap_model.min_dist, topic_model.umap_model.n_neighbors, topic_model.umap_model.n_components)\n",
        "print(topic_model.hdbscan_model, topic_model.hdbscan_model.metric, topic_model.hdbscan_model.prediction_data)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
